In [1]:
import numpy as np
import pandas as pd

Object Creation

Creating a Series: a 1D labelled array (built on an ndarray)


In [2]:
print(pd.Series( [1, 5, 2, 3, 6, 4] ) )


0    1
1    5
2    2
3    3
4    6
5    4
dtype: int64

Checking for a value in a Series' index and in its values


In [3]:
s = pd.Series(range(5), index=list('abcde'))
print('b' in s) # checks index values
print(s.isin([2])) # checks in values


True
a    False
b    False
c     True
d    False
e    False
dtype: bool

In [4]:
print(s)
s = s.reindex(list('afg'))
s # NaN forces the whole Series to float dtype. To avoid this, create the Series with object dtype (or a nullable dtype) up front.


a    0
b    1
c    2
d    3
e    4
dtype: int32
Out[4]:
a    0.0
f    NaN
g    NaN
dtype: float64

In [5]:
s_obj = pd.Series(range(5), index=list('abcde'), dtype=object)   # np.object is deprecated; plain object works
print(s_obj)
s_obj = s_obj.reindex(list('afg'))
s_obj # No change in the dtype


a    0
b    1
c    2
d    3
e    4
dtype: object
Out[5]:
a      0
f    NaN
g    NaN
dtype: object
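
An alternative sketch (assuming pandas >= 0.24, not part of the original notebook): the nullable 'Int64' extension dtype keeps integer values and marks missing entries as <NA> instead of falling back to float or object.

s_int = pd.Series(range(5), index=list('abcde'), dtype='Int64')
print(s_int.reindex(list('afg')))   # dtype stays Int64; missing labels become <NA>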

Creating DataFrame


In [6]:
# 1. Using a list of dictionaries
lst = [{"C1": 1, "C2": 2},
      {"C1": 5, "C2": 10, "C3": 20}]

In [7]:
# Observe the NaN where 'C3' is missing in the first row
print(pd.DataFrame(lst, index = ["R1", "R2"]))


    C1  C2    C3
R1   1   2   NaN
R2   5  10  20.0

In [8]:
# 2. Using a dict of lists (commonly used)
dct = {"C1": ["1", "3"],
      "C2": ["2","4"]}
print( pd.DataFrame(dct, index = ["R1", "R2"]) )


   C1 C2
R1  1  2
R2  3  4

In [9]:
# Defining index and column names inside the DataFrame constructor
print(pd.DataFrame(np.random.randn(2,2),
                  index = list('pq'),
                   columns = list('ab')))


          a         b
p  0.563066  0.494843
q -0.051123  1.303799

Data types


In [10]:
df = pd.DataFrame({'A': [10., 20.],
                  'B': "text",
                  'C': [2,60],
                  'D': 3+9j})
print(df)


      A     B   C       D
0  10.0  text   2  (3+9j)
1  20.0  text  60  (3+9j)

In [11]:
# Note the trailing 'dtype: object': df.dtypes is itself a Series,
# and object dtype lets it hold the mixed per-column dtypes.
print(df.dtypes)


A       float64
B        object
C         int64
D    complex128
dtype: object

In [12]:
print(df.info()) # the reported memory usage does not count the contents of object-dtype columns (note the '+')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
A    2 non-null float64
B    2 non-null object
C    2 non-null int64
D    2 non-null complex128
dtypes: complex128(1), float64(1), int64(1), object(1)
memory usage: 160.0+ bytes
None

In [13]:
df.info(memory_usage='deep') # with 'deep', the contents of object-dtype columns are counted as well


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
A    2 non-null float64
B    2 non-null object
C    2 non-null int64
D    2 non-null complex128
dtypes: complex128(1), float64(1), int64(1), object(1)
memory usage: 286.0 bytes
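
For a per-column breakdown, a small aside (the same 'deep' caveat applies):

print(df.memory_usage(deep=True))   # bytes used by the index and by each column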

Viewing Data


In [14]:
print(pd.DataFrame(np.random.randn(20,2)).head())


          0         1
0  2.076125 -0.371193
1 -0.620316 -0.521640
2 -0.555493 -0.146482
3 -0.614715  1.007617
4  0.219642 -0.311603

In [15]:
print(pd.DataFrame(np.random.randn(20,2)).tail(5))


           0         1
15  0.938823 -1.065879
16  1.284274  0.952037
17  1.633314 -0.249550
18 -0.881654 -1.234461
19  1.162749  1.542099

In [16]:
df = pd.DataFrame({'A': [10., 20.],
                  'B': "text",
                  'C': [2,60],
                  'D': 3+9j}, index = list('PQ'))

In [17]:
print(df.index)     # row index


Index(['P', 'Q'], dtype='object')

In [18]:
print(df.columns)   # index of columns


Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
print(df.values)    # the underlying data as a NumPy array


[[10.0 'text' 2 (3+9j)]
 [20.0 'text' 60 (3+9j)]]

DataFrame Attributes


In [20]:
df = pd.DataFrame({'A': [1,4,7],
                   'B': [2,5,8],
                   'C': [3,6,9]},
                   index = list('PQR'))
print(df)


   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9

In [21]:
# Transpose
print(df.T)


   P  Q  R
A  1  4  7
B  2  5  8
C  3  6  9

In [22]:
# shape, size and dimensions
print(df.shape, df.size, df.ndim)


(3, 3) 9 2

DataFrame Methods


In [23]:
df1 = pd.DataFrame({'A': [1,4,7],
                   'B': [2,5,8],
                   'C': [3,6,9]},
                   index = list('PQR'))
df_temp = df1   # just another reference to df1, not a copy (use df1.copy() for an independent frame)
df2 = pd.DataFrame({'A': [-1,4,7],
                   'B': [2,-5,8],
                   'C': [3,6,-9]},
                   index = list('PQR'))

df3 = pd.DataFrame({'A': [1,-4,-7],
                   'B': [-2,5,-8],
                   'C': [-3,-6,9]},
                   index = list('STU'))

Addition | Subtraction


In [24]:
print(df1.add(df2))


    A   B   C
P   0   4   6
Q   8   0  12
R  14  16   0

In [25]:
print(df1.sub(df3))


    A   B   C
P NaN NaN NaN
Q NaN NaN NaN
R NaN NaN NaN
S NaN NaN NaN
T NaN NaN NaN
U NaN NaN NaN

Appending 2 dfs


In [26]:
print(pd.concat([df_temp, df2]))   # DataFrame.append() is deprecated/removed; pd.concat is its replacement


   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9
P -1  2  3
Q  4 -5  6
R  7  8 -9

Applying a function


In [27]:
print(df3.apply(np.abs, axis = 1))


   A  B  C
S  1  2  3
T  4  5  6
U  7  8  9

In [28]:
df3


Out[28]:
   A  B  C
S  1 -2 -3
T -4  5 -6
U -7 -8  9

In [29]:
df3.apply(np.sum, axis = 0)


Out[29]:
A   -10
B    -5
C     0
dtype: int64
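
apply also accepts a user-defined function; a quick sketch (not in the original notebook) computing the range (max - min) of each row of df3:

print(df3.apply(lambda row: row.max() - row.min(), axis = 1))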

Converting frame to its numpy array representation


In [30]:
print(type(df3.to_numpy()))   # .as_matrix() was removed; .to_numpy() (or .values) is the replacement


<class 'numpy.ndarray'>

Counting non-NaN values


In [31]:
df1


Out[31]:
   A  B  C
P  1  2  3
Q  4  5  6
R  7  8  9

In [32]:
print(df1.count())  # by default counts down each column


A    3
B    3
C    3
dtype: int64

In [33]:
print(df1.count(axis = 1))  # counts across each row


P    3
Q    3
R    3
dtype: int64

In [34]:
print(df1.sub(df3).count())   # every entry is NaN, so all counts are 0


A    0
B    0
C    0
dtype: int64

Handling NULL values


In [35]:
df = pd.DataFrame(np.nan,columns = list('AB'),
                 index = list('CD'))
print(df)


    A   B
C NaN NaN
D NaN NaN

In [36]:
print(df.isnull())


      A     B
C  True  True
D  True  True

Filling NaNs


In [37]:
print(df1.sub(df3).fillna(5))


     A    B    C
P  5.0  5.0  5.0
Q  5.0  5.0  5.0
R  5.0  5.0  5.0
S  5.0  5.0  5.0
T  5.0  5.0  5.0
U  5.0  5.0  5.0
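
fillna can also take a per-column mapping of fill values; a minimal sketch on a small hypothetical frame df_na (not from the original notebook):

df_na = pd.DataFrame({'A': [1, np.nan], 'B': [np.nan, 4]})
print(df_na.fillna({'A': 0, 'B': -1}))   # fill column A with 0, column B with -1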

Drop NaNs


In [38]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0], 
                  [3, 4, np.nan, 1],
                  [np.nan, np.nan, np.nan, 5]],
                  columns=list('ABCD'))
print(df)


     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5

In [39]:
print(df.dropna(axis = 1, how = 'all'))


     A    B  D
0  NaN  2.0  0
1  3.0  4.0  1
2  NaN  NaN  5

In [40]:
print(df.dropna(axis = 1, how='any'))


   D
0  0
1  1
2  5
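
dropna also supports a threshold; a brief sketch keeping only rows with at least two non-NaN values:

print(df.dropna(axis = 0, thresh = 2))   # row 2 has only one real value and is dropped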

Handling Duplicates


In [41]:
df = pd.DataFrame([[1, 2, 3], 
                  [1, 2, 3], 
                  [4, 5, 6],
                  [1, 2, 3]], columns =list('ABC'))
# flag rows that duplicate an earlier row
print(df.duplicated())


0    False
1     True
2    False
3     True
dtype: bool
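
To actually remove the flagged rows, drop_duplicates() keeps the first occurrence by default (a short follow-up sketch):

print(df.drop_duplicates())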

Checking the presence of values in a df


In [42]:
df = pd.DataFrame([["hi", 2, 3], 
                  [1, 20, 3+3j], 
                  [4, "world", 6]], 
                   columns =list('ABC'))
print(df)


    A      B       C
0  hi      2  (3+0j)
1   1     20  (3+3j)
2   4  world  (6+0j)

In [43]:
print(df.isin( [20, 6+0j, 'hi'] ))


       A      B      C
0   True  False  False
1  False   True  False
2  False  False   True

Selection

Selection by Label


In [44]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]], 
                  index = list('ABC'),
                  columns = list('DE'))
print(df)


    D   E
A  15  12
B  33  54
C  10  32

In [45]:
print(df.loc[['A','C'],:])


    D   E
A  15  12
C  10  32

In [46]:
print(df.loc[:,'E'])


A    12
B    54
C    32
Name: E, dtype: int64

In [47]:
print(df.loc['B', 'D'])


33

Selection by Position


In [48]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]])

print(df)


    0   1
0  15  12
1  33  54
2  10  32

In [49]:
print(df.iloc[[0,2],:])


    0   1
0  15  12
2  10  32

In [50]:
print(df.iloc[:,0:1])


    0
0  15
1  33
2  10

In [51]:
print(df.iloc[:,:])


    0   1
0  15  12
1  33  54
2  10  32

RegEx


In [52]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]], 
                  index = ['one','two','three'],
                  columns = ['col1', 'col2'])

print(df)


       col1  col2
one      15    12
two      33    54
three    10    32

In [53]:
print(df.filter(regex = 'e$', axis = 0))


       col1  col2
one      15    12
three    10    32

In [54]:
print(df.filter(regex = '^c', axis = 1))


       col1  col2
one      15    12
two      33    54
three    10    32
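
filter also offers a simpler substring match via the like argument (a small aside, not from the original notebook):

print(df.filter(like = 'col', axis = 1))   # columns whose name contains 'col'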

Boolean indexing


In [55]:
df = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]])

df[df < 15] = 10
df


Out[55]:
    0   1
0  15  10
1  33  54
2  10  32
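
A boolean condition can also select whole rows instead of masking individual cells; a brief sketch on the same frame:

print(df[df[0] > 10])   # rows where column 0 exceeds 10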

Operations


In [56]:
df1 = pd.DataFrame([[15, 12],
                  [33, 54],
                  [10, 32]],
                  columns = list('AB')) 
df2 = pd.DataFrame([[15, 12, -3],
                  [33, 54, 21],
                  [10, 32, 22]],
                  columns = list('ABC'))
df3 = pd.DataFrame([[10, 1, 3],
                  [33, -54, 2],
                  [10, 0.32, 2]],
                  columns = list('ABC'))

print(df1)


    A   B
0  15  12
1  33  54
2  10  32

In [57]:
print(df1.eval('B - A'))


0    -3
1    21
2    22
dtype: int64

In [58]:
df1.eval('C = B - A', inplace=False)   # the 'C =' target is optional; inplace=True returns nothing and modifies df1 itself


Out[58]:
    A   B   C
0  15  12  -3
1  33  54  21
2  10  32  22

In [59]:
loc_var = np.arange(3)
df1.eval('C = A + @loc_var', inplace=False)   # @ lets eval reference variables from the surrounding Python scope


Out[59]:
    A   B   C
0  15  12  15
1  33  54  34
2  10  32  12

In [60]:
print(df1)


    A   B
0  15  12
1  33  54
2  10  32
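
The same expression engine also backs query(), which filters rows with a string condition (a hedged aside, not in the original notebook):

print(df1.query('A > 10'))   # rows of df1 where column A exceeds 10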

Comparison Operations


In [61]:
print(df1.equals(df2))  # compares the two frames as a whole and returns a single bool


False

In [62]:
print(df2.eq(df3))


       A      B      C
0  False  False  False
1   True  False  False
2   True  False  False

In [63]:
print(df2.le(df3))


       A      B      C
0  False  False   True
1   True  False  False
2   True  False  False

In [64]:
print(df2.ne(df3))


       A     B     C
0   True  True  True
1  False  True  True
2  False  True  True

Merging dfs

Concatenating at the bottom (row-wise)


In [65]:
print(pd.concat( [df2,df3], axis = 0 ))


    A      B   C
0  15  12.00  -3
1  33  54.00  21
2  10  32.00  22
0  10   1.00   3
1  33 -54.00   2
2  10   0.32   2

Concatenating at the side (column-wise)


In [66]:
print(pd.concat( [df2,df3], axis = 1 ))


    A   B   C   A      B  C
0  15  12  -3  10   1.00  3
1  33  54  21  33 -54.00  2
2  10  32  22  10   0.32  2
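
Besides concatenation, pandas offers SQL-style joins via pd.merge; a minimal sketch on two small hypothetical frames (not from the original notebook):

left = pd.DataFrame({'key': ['K0', 'K1'], 'L': [1, 2]})
right = pd.DataFrame({'key': ['K1', 'K2'], 'R': [3, 4]})
print(pd.merge(left, right, on = 'key', how = 'inner'))   # only the shared key 'K1' survives the inner join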

Reshaping & Pivot tables

Pivot tables


In [67]:
df = pd.DataFrame([
           ['PID0', 'Gold', '1$', '2£'],
           ['PID0', 'Bronze', '2$', '4£'],
           ['PID1', 'Gold', '3$', '6£'],
           ['PID1', 'Silver', '4$', '8£'],
           ], columns = ['PID', 'CType', 
               'USD', 'PND'])
                 
print(df)


    PID   CType USD PND
0  PID0    Gold  1$  2£
1  PID0  Bronze  2$  4£
2  PID1    Gold  3$  6£
3  PID1  Silver  4$  8£

In [68]:
print(df.pivot(index = 'PID', columns = 'CType'))


         USD                PND            
CType Bronze Gold Silver Bronze Gold Silver
PID                                        
PID0      2$   1$   None     4£   2£   None
PID1    None   3$     4$   None   6£     8£

In [69]:
print(df.pivot(index = 'PID', columns = 'CType',
              values = 'USD'))


CType Bronze Gold Silver
PID                     
PID0      2$   1$   None
PID1    None   3$     4$

In [70]:
df = pd.DataFrame([
           ['PID0', 'Gold', '1$'],
           ['PID0', 'Bronze', '2$'],
           ['PID0', 'Gold', '3$'],     #Duplicate
           ['PID1', 'Silver', '4$'],
           ], columns = ['PID', 'CType', 
               'USD'])

In [71]:
# Ambiguity: both 1$ and 3$ map to the (PID0, Gold) cell,
# so a plain pivot() cannot decide which value to keep.

print(df)


    PID   CType USD
0  PID0    Gold  1$
1  PID0  Bronze  2$
2  PID0    Gold  3$
3  PID1  Silver  4$
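
To see the ambiguity directly, a plain pivot() on this frame raises an error (a small hedged sketch):

try:
    df.pivot(index = 'PID', columns = 'CType', values = 'USD')
except ValueError as err:
    print('pivot failed:', err)   # duplicate (PID0, Gold) entries cannot be reshaped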

In [72]:
print(df.pivot_table(index = 'PID', 
                    columns = 'CType',
                    aggfunc = np.max))


         USD             
CType Bronze  Gold Silver
PID                      
PID0      2$    3$   None
PID1    None  None     4$

Reshaping


In [73]:
multi_ind = pd.MultiIndex.from_tuples(
                                     [('IND','R1'),
                                      ('IND','R2'),
                                      ('US','R1'),
                                      ('US','R2')],
                                      names = [
                                               '1st',
                                               '2nd'])
df = pd.DataFrame(np.random.randn(4,2),
                 index = multi_ind,
                 columns = ['C1', 'C2'])

print(df)


               C1        C2
1st 2nd                    
IND R1   0.609643  0.767526
    R2   0.212994  0.589823
US  R1  -0.745894 -0.024052
    R2  -0.435979 -0.980595

In [74]:
print(df.stack())   # columns move into the row index, so the frame gets longer


1st  2nd    
IND  R1   C1    0.609643
          C2    0.767526
     R2   C1    0.212994
          C2    0.589823
US   R1   C1   -0.745894
          C2   -0.024052
     R2   C1   -0.435979
          C2   -0.980595
dtype: float64

In [75]:
print(df.unstack()) # the innermost index level moves into the columns, so the frame gets wider


           C1                  C2          
2nd        R1        R2        R1        R2
1st                                        
IND  0.609643  0.212994  0.767526  0.589823
US  -0.745894 -0.435979 -0.024052 -0.980595
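
unstack can also pivot a specific level instead of the innermost one (a brief aside):

print(df.unstack(level = 0))   # move the '1st' level (IND/US) into the columns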

Grouping


In [76]:
df = pd.DataFrame([["B", 1],
                  ["B", 2],
                  ["A", 3],
                  ["A", 4]], columns = list('XY'))

print(df)


   X  Y
0  B  1
1  B  2
2  A  3
3  A  4

In [77]:
print(df.groupby(['X']).sum())


   Y
X   
A  7
B  3

In [78]:
print(df.groupby(['X'], sort = False).sum())


   Y
X   
B  3
A  7

In [79]:
df = pd.DataFrame({'A' : list('ppppqqqq'),
                  'B' : list('rrssrrss'),
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})

print(df)


   A  B         C         D
0  p  r -0.035902 -1.610307
1  p  r  0.817545  1.032971
2  p  s  0.874735 -0.811189
3  p  s -0.480971 -1.050927
4  q  r  0.724361  1.857076
5  q  r -0.093597 -0.479472
6  q  s -0.199158 -0.220293
7  q  s -0.082610 -0.028525

In [80]:
print(df.groupby(['A', 'B']).agg(
                C_mean = ('C', 'mean'),
                D_median = ('D', 'median')))   # named aggregation; the old dict-renaming form is deprecated


       C_mean  D_median
A B                    
p r  0.390822 -0.288668
  s  0.196882 -0.931058
q r  0.315382  0.688802
  s -0.140884 -0.124409
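
A related sketch: size() reports how many rows fall in each group.

print(df.groupby(['A', 'B']).size())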

Import and Export


In [81]:
df = pd.DataFrame([[11, 202],
                  [33, 44]],
                  index = list('AB'),
                   columns = list('CD'))

Writing to an Excel file


In [82]:
df.to_excel('pd_df.xlsx', sheet_name = 'Sheet1')

Reading from an Excel file


In [83]:
print(pd.read_excel('pd_df.xlsx', 'Sheet1'))


    C    D
A  11  202
B  33   44
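
The same pattern works for CSV files (a parallel sketch, assuming write access to the working directory):

df.to_csv('pd_df.csv')
print(pd.read_csv('pd_df.csv', index_col = 0))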